OVERVIEW

This exercise accompanies the lessons in Environmental Data Analytics on Data Wrangling

Set up your session

  1. Check your working directory, load the tidyverse and lubridate packages, and upload all four raw data files associated with the EPA Air dataset, being sure to set string columns to be read in a factors. See the README file for the EPA air datasets for more information (especially if you have not worked with air quality data previously).
# Load packages

library(tidyverse)
library(lubridate)
library(skimr)

# Check working directory

getwd()
## [1] "C:/Users/sasho/Desktop/Environ Data Analytics/Env872 Workspace/EDA-Fall2022_SM/Assignments/Assignment 4"
# Load data

EPA.Air.NC18 <- read.csv("./EPAair_O3_NC2018_raw.csv", stringsAsFactors = TRUE)
EPA.Air.NC19 <- read.csv("./EPAair_O3_NC2019_raw.csv", stringsAsFactors = TRUE)
EPA.Air.PM25_NC18 <- read.csv("./EPAair_PM25_NC2018_raw.csv", stringsAsFactors = TRUE)
EPA.Air.PM25_NC19 <- read.csv("./EPAair_PM25_NC2019_raw.csv", stringsAsFactors = TRUE)
  1. Explore the dimensions, column names, and structure of the datasets.
# 1. Air Data

############# Explore dimensions #####################

dim(EPA.Air.NC18)
## [1] 9737   20
dim(EPA.Air.NC19)
## [1] 10592    20
############# Explore column names and structure of databases ###########

summary(EPA.Air.NC18)
##          Date      Source        Site.ID               POC   
##  04/01/2018:  40   AQS:9737   Min.   :370030005   Min.   :1  
##  04/12/2018:  40              1st Qu.:370650099   1st Qu.:1  
##  04/13/2018:  40              Median :371010002   Median :1  
##  04/14/2018:  40              Mean   :370969118   Mean   :1  
##  04/15/2018:  40              3rd Qu.:371290002   3rd Qu.:1  
##  04/18/2018:  40              Max.   :371990004   Max.   :1  
##  (Other)   :9497                                             
##  Daily.Max.8.hour.Ozone.Concentration UNITS      DAILY_AQI_VALUE 
##  Min.   :0.00200                      ppm:9737   Min.   :  2.00  
##  1st Qu.:0.03400                                 1st Qu.: 31.00  
##  Median :0.04200                                 Median : 39.00  
##  Mean   :0.04194                                 Mean   : 40.22  
##  3rd Qu.:0.04900                                 3rd Qu.: 45.00  
##  Max.   :0.07700                                 Max.   :122.00  
##                                                                  
##                 Site.Name    DAILY_OBS_COUNT PERCENT_COMPLETE
##  Coweeta             : 355   Min.   :12.00   Min.   : 71.00  
##  Garinger High School: 354   1st Qu.:17.00   1st Qu.:100.00  
##  Millbrook School    : 352   Median :17.00   Median :100.00  
##  Candor              : 335   Mean   :16.94   Mean   : 99.65  
##  Rockwell            : 335   3rd Qu.:17.00   3rd Qu.:100.00  
##  Cranberry           : 323   Max.   :17.00   Max.   :100.00  
##  (Other)             :7683                                   
##  AQS_PARAMETER_CODE AQS_PARAMETER_DESC   CBSA_CODE    
##  Min.   :44201      Ozone:9737         Min.   :11700  
##  1st Qu.:44201                         1st Qu.:16740  
##  Median :44201                         Median :24660  
##  Mean   :44201                         Mean   :27247  
##  3rd Qu.:44201                         3rd Qu.:39580  
##  Max.   :44201                         Max.   :49180  
##                                        NA's   :2609   
##                              CBSA_NAME      STATE_CODE            STATE     
##                                   :2609   Min.   :37   North Carolina:9737  
##  Charlotte-Concord-Gastonia, NC-SC:1338   1st Qu.:37                        
##  Asheville, NC                    : 927   Median :37                        
##  Winston-Salem, NC                : 725   Mean   :37                        
##  Raleigh, NC                      : 585   3rd Qu.:37                        
##  Hickory-Lenoir-Morganton, NC     : 477   Max.   :37                        
##  (Other)                          :3076                                     
##   COUNTY_CODE             COUNTY     SITE_LATITUDE   SITE_LONGITUDE  
##  Min.   :  3.00   Forsyth    : 725   Min.   :34.36   Min.   :-83.80  
##  1st Qu.: 65.00   Haywood    : 683   1st Qu.:35.26   1st Qu.:-82.05  
##  Median :101.00   Mecklenburg: 592   Median :35.55   Median :-80.34  
##  Mean   : 96.78   Avery      : 558   Mean   :35.62   Mean   :-80.42  
##  3rd Qu.:129.00   Swain      : 483   3rd Qu.:36.03   3rd Qu.:-78.90  
##  Max.   :199.00   Cumberland : 444   Max.   :36.31   Max.   :-76.62  
##                   (Other)    :6252
skim(EPA.Air.NC18)
## Warning in sorted_count(x): Variable contains value(s) of "" that have been
## converted to "empty".

## Warning in sorted_count(x): Variable contains value(s) of "" that have been
## converted to "empty".
Data summary
Name EPA.Air.NC18
Number of rows 9737
Number of columns 20
_______________________
Column type frequency:
factor 8
numeric 12
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Date 0 1 FALSE 364 04/: 40, 04/: 40, 04/: 40, 04/: 40
Source 0 1 FALSE 1 AQS: 9737
UNITS 0 1 FALSE 1 ppm: 9737
Site.Name 0 1 FALSE 40 Cow: 355, Gar: 354, Mil: 352, Can: 335
AQS_PARAMETER_DESC 0 1 FALSE 1 Ozo: 9737
CBSA_NAME 0 1 FALSE 17 emp: 2609, Cha: 1338, Ash: 927, Win: 725
STATE 0 1 FALSE 1 Nor: 9737
COUNTY 0 1 FALSE 32 For: 725, Hay: 683, Mec: 592, Ave: 558

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Site.ID 0 1.00 370969118.02 519741.72 3.7003e+08 370650099.00 3.7101e+08 3.7129e+08 3.7199e+08 ▅▆▇▅▃
POC 0 1.00 1.00 0.00 1.0000e+00 1.00 1.0000e+00 1.0000e+00 1.0000e+00 ▁▁▇▁▁
Daily.Max.8.hour.Ozone.Concentration 0 1.00 0.04 0.01 0.0000e+00 0.03 4.0000e-02 5.0000e-02 8.0000e-02 ▁▃▇▅▁
DAILY_AQI_VALUE 0 1.00 40.22 13.40 2.0000e+00 31.00 3.9000e+01 4.5000e+01 1.2200e+02 ▁▇▁▁▁
DAILY_OBS_COUNT 0 1.00 16.94 0.40 1.2000e+01 17.00 1.7000e+01 1.7000e+01 1.7000e+01 ▁▁▁▁▇
PERCENT_COMPLETE 0 1.00 99.65 2.38 7.1000e+01 100.00 1.0000e+02 1.0000e+02 1.0000e+02 ▁▁▁▁▇
AQS_PARAMETER_CODE 0 1.00 44201.00 0.00 4.4201e+04 44201.00 4.4201e+04 4.4201e+04 4.4201e+04 ▁▁▇▁▁
CBSA_CODE 2609 0.73 27246.97 12267.89 1.1700e+04 16740.00 2.4660e+04 3.9580e+04 4.9180e+04 ▇▇▂▃▃
STATE_CODE 0 1.00 37.00 0.00 3.7000e+01 37.00 3.7000e+01 3.7000e+01 3.7000e+01 ▁▁▇▁▁
COUNTY_CODE 0 1.00 96.78 52.03 3.0000e+00 65.00 1.0100e+02 1.2900e+02 1.9900e+02 ▅▆▇▅▃
SITE_LATITUDE 0 1.00 35.62 0.43 3.4360e+01 35.26 3.5550e+01 3.6030e+01 3.6310e+01 ▁▂▇▅▇
SITE_LONGITUDE 0 1.00 -80.42 1.97 -8.3800e+01 -82.05 -8.0340e+01 -7.8900e+01 -7.6620e+01 ▆▅▇▆▅
head(EPA.Air.NC18)
tail(EPA.Air.NC18)
#########################

summary(EPA.Air.NC19)
##          Date          Source        Site.ID               POC   
##  03/18/2019:   38   AirNow:2126   Min.   :370030005   Min.   :1  
##  03/19/2019:   38   AQS   :8466   1st Qu.:370630015   1st Qu.:1  
##  03/20/2019:   38                 Median :370870036   Median :1  
##  03/23/2019:   38                 Mean   :370960317   Mean   :1  
##  03/24/2019:   38                 3rd Qu.:371290002   3rd Qu.:1  
##  03/25/2019:   38                 Max.   :371990004   Max.   :1  
##  (Other)   :10364                                                
##  Daily.Max.8.hour.Ozone.Concentration UNITS       DAILY_AQI_VALUE
##  Min.   :0.00000                      ppm:10592   Min.   :  0.0  
##  1st Qu.:0.03600                                  1st Qu.: 33.0  
##  Median :0.04400                                  Median : 41.0  
##  Mean   :0.04331                                  Mean   : 41.2  
##  3rd Qu.:0.05000                                  3rd Qu.: 46.0  
##  Max.   :0.08100                                  Max.   :136.0  
##                                                                  
##                 Site.Name    DAILY_OBS_COUNT PERCENT_COMPLETE
##  Garinger High School: 363   Min.   :13.00   Min.   : 75.00  
##  Millbrook School    : 362   1st Qu.:17.00   1st Qu.:100.00  
##  Coweeta             : 361   Median :17.00   Median :100.00  
##  Rockwell            : 361   Mean   :18.34   Mean   : 99.69  
##  Candor              : 358   3rd Qu.:17.00   3rd Qu.:100.00  
##  Cranberry           : 351   Max.   :24.00   Max.   :100.00  
##  (Other)             :8436                                   
##  AQS_PARAMETER_CODE AQS_PARAMETER_DESC   CBSA_CODE    
##  Min.   :44201      Ozone:10592        Min.   :11700  
##  1st Qu.:44201                         1st Qu.:16740  
##  Median :44201                         Median :24660  
##  Mean   :44201                         Mean   :26617  
##  3rd Qu.:44201                         3rd Qu.:37080  
##  Max.   :44201                         Max.   :49180  
##                                        NA's   :2852   
##                              CBSA_NAME      STATE_CODE            STATE      
##                                   :2852   Min.   :37   North Carolina:10592  
##  Charlotte-Concord-Gastonia, NC-SC:1590   1st Qu.:37                         
##  Asheville, NC                    :1114   Median :37                         
##  Winston-Salem, NC                : 735   Mean   :37                         
##  Raleigh, NC                      : 646   3rd Qu.:37                         
##  Hickory-Lenoir-Morganton, NC     : 567   Max.   :37                         
##  (Other)                          :3088                                      
##   COUNTY_CODE            COUNTY     SITE_LATITUDE   SITE_LONGITUDE  
##  Min.   :  3.0   Haywood    : 864   Min.   :34.36   Min.   :-83.80  
##  1st Qu.: 63.0   Forsyth    : 735   1st Qu.:35.26   1st Qu.:-82.05  
##  Median : 87.0   Mecklenburg: 657   Median :35.59   Median :-80.34  
##  Mean   : 95.9   Avery      : 607   Mean   :35.61   Mean   :-80.41  
##  3rd Qu.:129.0   Cumberland : 498   3rd Qu.:36.03   3rd Qu.:-78.77  
##  Max.   :199.0   Swain      : 476   Max.   :36.31   Max.   :-76.62  
##                  (Other)    :6755
skim(EPA.Air.NC19)
## Warning in sorted_count(x): Variable contains value(s) of "" that have been
## converted to "empty".

## Warning in sorted_count(x): Variable contains value(s) of "" that have been
## converted to "empty".
Data summary
Name EPA.Air.NC19
Number of rows 10592
Number of columns 20
_______________________
Column type frequency:
factor 8
numeric 12
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Date 0 1 FALSE 365 03/: 38, 03/: 38, 03/: 38, 03/: 38
Source 0 1 FALSE 2 AQS: 8466, Air: 2126
UNITS 0 1 FALSE 1 ppm: 10592
Site.Name 0 1 FALSE 38 Gar: 363, Mil: 362, Cow: 361, Roc: 361
AQS_PARAMETER_DESC 0 1 FALSE 1 Ozo: 10592
CBSA_NAME 0 1 FALSE 15 emp: 2852, Cha: 1590, Ash: 1114, Win: 735
STATE 0 1 FALSE 1 Nor: 10592
COUNTY 0 1 FALSE 30 Hay: 864, For: 735, Mec: 657, Ave: 607

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Site.ID 0 1.00 370960316.99 525393.35 3.7003e+08 3.7063e+08 3.7087e+08 3.7129e+08 3.7199e+08 ▅▇▇▅▃
POC 0 1.00 1.00 0.00 1.0000e+00 1.0000e+00 1.0000e+00 1.0000e+00 1.0000e+00 ▁▁▇▁▁
Daily.Max.8.hour.Ozone.Concentration 0 1.00 0.04 0.01 0.0000e+00 4.0000e-02 4.0000e-02 5.0000e-02 8.0000e-02 ▁▂▇▅▁
DAILY_AQI_VALUE 0 1.00 41.20 11.88 0.0000e+00 3.3000e+01 4.1000e+01 4.6000e+01 1.3600e+02 ▁▇▁▁▁
DAILY_OBS_COUNT 0 1.00 18.34 2.77 1.3000e+01 1.7000e+01 1.7000e+01 1.7000e+01 2.4000e+01 ▁▇▁▁▂
PERCENT_COMPLETE 0 1.00 99.69 2.38 7.5000e+01 1.0000e+02 1.0000e+02 1.0000e+02 1.0000e+02 ▁▁▁▁▇
AQS_PARAMETER_CODE 0 1.00 44201.00 0.00 4.4201e+04 4.4201e+04 4.4201e+04 4.4201e+04 4.4201e+04 ▁▁▇▁▁
CBSA_CODE 2852 0.73 26616.75 12146.59 1.1700e+04 1.6740e+04 2.4660e+04 3.7080e+04 4.9180e+04 ▇▇▂▃▃
STATE_CODE 0 1.00 37.00 0.00 3.7000e+01 3.7000e+01 3.7000e+01 3.7000e+01 3.7000e+01 ▁▁▇▁▁
COUNTY_CODE 0 1.00 95.90 52.60 3.0000e+00 6.3000e+01 8.7000e+01 1.2900e+02 1.9900e+02 ▅▇▇▅▃
SITE_LATITUDE 0 1.00 35.61 0.45 3.4360e+01 3.5260e+01 3.5590e+01 3.6030e+01 3.6310e+01 ▁▂▇▆▇
SITE_LONGITUDE 0 1.00 -80.41 1.98 -8.3800e+01 -8.2050e+01 -8.0340e+01 -7.8770e+01 -7.6620e+01 ▆▅▇▆▅
head(EPA.Air.NC19)
tail(EPA.Air.NC19)
# 2. PM 2.5 Data

############# Explore dimensions #####################

dim(EPA.Air.PM25_NC18)
## [1] 8983   20
dim(EPA.Air.PM25_NC19)
## [1] 8581   20
############# Explore column names and structure of databases ###########

summary(EPA.Air.PM25_NC18)
##          Date      Source        Site.ID               POC       
##  01/26/2018:  40   AQS:8983   Min.   :370110002   Min.   :1.000  
##  02/01/2018:  40              1st Qu.:370630015   1st Qu.:3.000  
##  02/19/2018:  40              Median :371010002   Median :3.000  
##  03/21/2018:  40              Mean   :371002405   Mean   :2.812  
##  04/02/2018:  40              3rd Qu.:371230001   3rd Qu.:3.000  
##  04/08/2018:  40              Max.   :371830021   Max.   :5.000  
##  (Other)   :8743                                                 
##  Daily.Mean.PM2.5.Concentration      UNITS      DAILY_AQI_VALUE
##  Min.   :-2.300                 ug/m3 LC:8983   Min.   : 0.00  
##  1st Qu.: 4.900                                 1st Qu.:20.00  
##  Median : 7.000                                 Median :29.00  
##  Mean   : 7.491                                 Mean   :30.73  
##  3rd Qu.: 9.700                                 3rd Qu.:40.00  
##  Max.   :34.200                                 Max.   :97.00  
##                                                                
##                 Site.Name    DAILY_OBS_COUNT PERCENT_COMPLETE
##  Millbrook School    : 717   Min.   :1       Min.   :100     
##  Hattie Avenue       : 510   1st Qu.:1       1st Qu.:100     
##  Board Of Ed. Bldg.  : 477   Median :1       Median :100     
##  Garinger High School: 472   Mean   :1       Mean   :100     
##  Durham Armory       : 466   3rd Qu.:1       3rd Qu.:100     
##  Pitt Agri. Center   : 460   Max.   :1       Max.   :100     
##  (Other)             :5881                                   
##  AQS_PARAMETER_CODE                              AQS_PARAMETER_DESC
##  Min.   :88101      Acceptable PM2.5 AQI & Speciation Mass:1403    
##  1st Qu.:88101      PM2.5 - Local Conditions              :7580    
##  Median :88101                                                     
##  Mean   :88164                                                     
##  3rd Qu.:88101                                                     
##  Max.   :88502                                                     
##                                                                    
##    CBSA_CODE                                 CBSA_NAME      STATE_CODE
##  Min.   :11700   Raleigh, NC                      :1396   Min.   :37  
##  1st Qu.:19000   Winston-Salem, NC                :1316   1st Qu.:37  
##  Median :25860   Charlotte-Concord-Gastonia, NC-SC:1275   Median :37  
##  Mean   :30946                                    :1263   Mean   :37  
##  3rd Qu.:40580   Asheville, NC                    : 586   3rd Qu.:37  
##  Max.   :49180   Durham-Chapel Hill, NC           : 466   Max.   :37  
##  NA's   :1263    (Other)                          :2681               
##             STATE       COUNTY_CODE            COUNTY     SITE_LATITUDE  
##  North Carolina:8983   Min.   : 11.0   Mecklenburg:1275   Min.   :34.36  
##                        1st Qu.: 63.0   Wake       :1049   1st Qu.:35.26  
##                        Median :101.0   Forsyth    : 876   Median :35.64  
##                        Mean   :100.2   Buncombe   : 477   Mean   :35.61  
##                        3rd Qu.:123.0   Durham     : 466   3rd Qu.:35.91  
##                        Max.   :183.0   Pitt       : 460   Max.   :36.11  
##                                        (Other)    :4380                  
##  SITE_LONGITUDE  
##  Min.   :-83.44  
##  1st Qu.:-80.87  
##  Median :-80.23  
##  Mean   :-79.99  
##  3rd Qu.:-78.57  
##  Max.   :-76.21  
## 
skim(EPA.Air.PM25_NC18)
## Warning in sorted_count(x): Variable contains value(s) of "" that have been
## converted to "empty".

## Warning in sorted_count(x): Variable contains value(s) of "" that have been
## converted to "empty".
Data summary
Name EPA.Air.PM25_NC18
Number of rows 8983
Number of columns 20
_______________________
Column type frequency:
factor 8
numeric 12
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Date 0 1 FALSE 365 01/: 40, 02/: 40, 02/: 40, 03/: 40
Source 0 1 FALSE 1 AQS: 8983
UNITS 0 1 FALSE 1 ug/: 8983
Site.Name 0 1 FALSE 25 Mil: 717, Hat: 510, Boa: 477, Gar: 472
AQS_PARAMETER_DESC 0 1 FALSE 2 PM2: 7580, Acc: 1403
CBSA_NAME 0 1 FALSE 14 Ral: 1396, Win: 1316, Cha: 1275, emp: 1263
STATE 0 1 FALSE 1 Nor: 8983
COUNTY 0 1 FALSE 21 Mec: 1275, Wak: 1049, For: 876, Bun: 477

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Site.ID 0 1.00 371002405.04 480527.22 3.7011e+08 3.7063e+08 3.7101e+08 3.7123e+08 3.7183e+08 ▃▇▃▇▅
POC 0 1.00 2.81 0.91 1.0000e+00 3.0000e+00 3.0000e+00 3.0000e+00 5.0000e+00 ▂▁▇▁▁
Daily.Mean.PM2.5.Concentration 0 1.00 7.49 3.76 -2.3000e+00 4.9000e+00 7.0000e+00 9.7000e+00 3.4200e+01 ▃▇▁▁▁
DAILY_AQI_VALUE 0 1.00 30.73 14.37 0.0000e+00 2.0000e+01 2.9000e+01 4.0000e+01 9.7000e+01 ▃▇▅▁▁
DAILY_OBS_COUNT 0 1.00 1.00 0.00 1.0000e+00 1.0000e+00 1.0000e+00 1.0000e+00 1.0000e+00 ▁▁▇▁▁
PERCENT_COMPLETE 0 1.00 100.00 0.00 1.0000e+02 1.0000e+02 1.0000e+02 1.0000e+02 1.0000e+02 ▁▁▇▁▁
AQS_PARAMETER_CODE 0 1.00 88163.63 145.58 8.8101e+04 8.8101e+04 8.8101e+04 8.8101e+04 8.8502e+04 ▇▁▁▁▂
CBSA_CODE 1263 0.86 30945.83 13099.36 1.1700e+04 1.9000e+04 2.5860e+04 4.0580e+04 4.9180e+04 ▇▇▁▆▇
STATE_CODE 0 1.00 37.00 0.00 3.7000e+01 3.7000e+01 3.7000e+01 3.7000e+01 3.7000e+01 ▁▁▇▁▁
COUNTY_CODE 0 1.00 100.23 48.05 1.1000e+01 6.3000e+01 1.0100e+02 1.2300e+02 1.8300e+02 ▃▇▃▇▅
SITE_LATITUDE 0 1.00 35.61 0.40 3.4360e+01 3.5260e+01 3.5640e+01 3.5910e+01 3.6110e+01 ▁▁▃▅▇
SITE_LONGITUDE 0 1.00 -79.99 1.70 -8.3440e+01 -8.0870e+01 -8.0230e+01 -7.8570e+01 -7.6210e+01 ▅▆▇▇▃
head(EPA.Air.PM25_NC18)
tail(EPA.Air.PM25_NC18)
#########################

summary(EPA.Air.PM25_NC19)
##          Date         Source        Site.ID               POC       
##  02/26/2019:  41   AirNow:1670   Min.   :370110002   Min.   :1.000  
##  01/21/2019:  40   AQS   :6911   1st Qu.:370630015   1st Qu.:3.000  
##  02/14/2019:  40                 Median :371190041   Median :3.000  
##  01/09/2019:  39                 Mean   :371023743   Mean   :3.032  
##  01/27/2019:  39                 3rd Qu.:371290002   3rd Qu.:3.000  
##  02/02/2019:  39                 Max.   :371830021   Max.   :5.000  
##  (Other)   :8343                                                    
##  Daily.Mean.PM2.5.Concentration      UNITS      DAILY_AQI_VALUE
##  Min.   :-3.100                 ug/m3 LC:8581   Min.   : 0.00  
##  1st Qu.: 4.900                                 1st Qu.:20.00  
##  Median : 7.400                                 Median :31.00  
##  Mean   : 7.684                                 Mean   :31.51  
##  3rd Qu.:10.100                                 3rd Qu.:42.00  
##  Max.   :31.200                                 Max.   :91.00  
##                                                                
##                 Site.Name    DAILY_OBS_COUNT PERCENT_COMPLETE
##  Millbrook School    : 738   Min.   :1       Min.   :100     
##  Garinger High School: 629   1st Qu.:1       1st Qu.:100     
##  Remount             : 573   Median :1       Median :100     
##  Hickory Water Tower : 518   Mean   :1       Mean   :100     
##  Hattie Avenue       : 436   3rd Qu.:1       3rd Qu.:100     
##  Durham Armory       : 431   Max.   :1       Max.   :100     
##  (Other)             :5256                                   
##  AQS_PARAMETER_CODE                              AQS_PARAMETER_DESC
##  Min.   :88101      Acceptable PM2.5 AQI & Speciation Mass:1029    
##  1st Qu.:88101      PM2.5 - Local Conditions              :7552    
##  Median :88101                                                     
##  Mean   :88149                                                     
##  3rd Qu.:88101                                                     
##  Max.   :88502                                                     
##                                                                    
##    CBSA_CODE                                 CBSA_NAME      STATE_CODE
##  Min.   :11700   Raleigh, NC                      :1441   Min.   :37  
##  1st Qu.:19000   Charlotte-Concord-Gastonia, NC-SC:1379   1st Qu.:37  
##  Median :25860   Winston-Salem, NC                :1235   Median :37  
##  Mean   :31099                                    :1058   Mean   :37  
##  3rd Qu.:40580   Hickory-Lenoir-Morganton, NC     : 518   3rd Qu.:37  
##  Max.   :49180   Durham-Chapel Hill, NC           : 431   Max.   :37  
##  NA's   :1058    (Other)                          :2519               
##             STATE       COUNTY_CODE            COUNTY     SITE_LATITUDE  
##  North Carolina:8581   Min.   : 11.0   Mecklenburg:1379   Min.   :34.36  
##                        1st Qu.: 63.0   Wake       :1083   1st Qu.:35.26  
##                        Median :119.0   Forsyth    : 839   Median :35.73  
##                        Mean   :102.4   Catawba    : 518   Mean   :35.63  
##                        3rd Qu.:129.0   Durham     : 431   3rd Qu.:35.91  
##                        Max.   :183.0   Cumberland : 427   Max.   :36.51  
##                                        (Other)    :3904                  
##  SITE_LONGITUDE  
##  Min.   :-83.44  
##  1st Qu.:-80.87  
##  Median :-80.23  
##  Mean   :-79.95  
##  3rd Qu.:-78.57  
##  Max.   :-76.21  
## 
skim(EPA.Air.PM25_NC19)
## Warning in sorted_count(x): Variable contains value(s) of "" that have been
## converted to "empty".

## Warning in sorted_count(x): Variable contains value(s) of "" that have been
## converted to "empty".
Data summary
Name EPA.Air.PM25_NC19
Number of rows 8581
Number of columns 20
_______________________
Column type frequency:
factor 8
numeric 12
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Date 0 1 FALSE 365 02/: 41, 01/: 40, 02/: 40, 01/: 39
Source 0 1 FALSE 2 AQS: 6911, Air: 1670
UNITS 0 1 FALSE 1 ug/: 8581
Site.Name 0 1 FALSE 25 Mil: 738, Gar: 629, Rem: 573, Hic: 518
AQS_PARAMETER_DESC 0 1 FALSE 2 PM2: 7552, Acc: 1029
CBSA_NAME 0 1 FALSE 14 Ral: 1441, Cha: 1379, Win: 1235, emp: 1058
STATE 0 1 FALSE 1 Nor: 8581
COUNTY 0 1 FALSE 21 Mec: 1379, Wak: 1083, For: 839, Cat: 518

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Site.ID 0 1.00 371023743.50 481571.87 3.7011e+08 3.7063e+08 3.7119e+08 3.7129e+08 3.7183e+08 ▂▇▂▇▃
POC 0 1.00 3.03 0.78 1.0000e+00 3.0000e+00 3.0000e+00 3.0000e+00 5.0000e+00 ▁▁▇▂▁
Daily.Mean.PM2.5.Concentration 0 1.00 7.68 3.83 -3.1000e+00 4.9000e+00 7.4000e+00 1.0100e+01 3.1200e+01 ▂▇▂▁▁
DAILY_AQI_VALUE 0 1.00 31.51 14.67 0.0000e+00 2.0000e+01 3.1000e+01 4.2000e+01 9.1000e+01 ▃▇▆▁▁
DAILY_OBS_COUNT 0 1.00 1.00 0.00 1.0000e+00 1.0000e+00 1.0000e+00 1.0000e+00 1.0000e+00 ▁▁▇▁▁
PERCENT_COMPLETE 0 1.00 100.00 0.00 1.0000e+02 1.0000e+02 1.0000e+02 1.0000e+02 1.0000e+02 ▁▁▇▁▁
AQS_PARAMETER_CODE 0 1.00 88149.09 130.28 8.8101e+04 8.8101e+04 8.8101e+04 8.8101e+04 8.8502e+04 ▇▁▁▁▁
CBSA_CODE 1058 0.88 31098.81 12833.57 1.1700e+04 1.9000e+04 2.5860e+04 4.0580e+04 4.9180e+04 ▇▇▁▇▆
STATE_CODE 0 1.00 37.00 0.00 3.7000e+01 3.7000e+01 3.7000e+01 3.7000e+01 3.7000e+01 ▁▁▇▁▁
COUNTY_CODE 0 1.00 102.37 48.16 1.1000e+01 6.3000e+01 1.1900e+02 1.2900e+02 1.8300e+02 ▂▇▂▇▃
SITE_LATITUDE 0 1.00 35.63 0.43 3.4360e+01 3.5260e+01 3.5730e+01 3.5910e+01 3.6510e+01 ▁▂▆▇▂
SITE_LONGITUDE 0 1.00 -79.95 1.62 -8.3440e+01 -8.0870e+01 -8.0230e+01 -7.8570e+01 -7.6210e+01 ▃▆▆▇▂
head(EPA.Air.PM25_NC19)
tail(EPA.Air.PM25_NC19)

Wrangle individual datasets to create processed files.

  1. Change date to date

  2. Select the following columns: Date, DAILY_AQI_VALUE, Site.Name, AQS_PARAMETER_DESC, COUNTY, SITE_LATITUDE, SITE_LONGITUDE

  3. For the PM2.5 datasets, fill all cells in AQS_PARAMETER_DESC with “PM2.5” (all cells in this column should be identical).

  4. Save all four processed datasets in the Processed folder. Use the same file names as the raw files but replace “raw” with “processed”.

# 3. Change date to date

########################### Air Data #########################

class(EPA.Air.NC18$Date)  #Check class of variable
## [1] "factor"
EPA.Air.NC18$Date <- mdy(EPA.Air.NC18$Date)
class(EPA.Air.NC18$Date)  #Check change of class
## [1] "Date"
View(EPA.Air.NC18)

#########################

class(EPA.Air.NC19$Date)  #Check class of variable
## [1] "factor"
EPA.Air.NC19$Date <- mdy(EPA.Air.NC19$Date)
class(EPA.Air.NC19$Date)  #Check change of class
## [1] "Date"
View(EPA.Air.NC19)

########################## PM 2.5 ##########################

class(EPA.Air.PM25_NC18$Date)  #Check class of variable
## [1] "factor"
EPA.Air.PM25_NC18$Date <- mdy(EPA.Air.PM25_NC18$Date)
class(EPA.Air.PM25_NC18$Date)  #Check change of class
## [1] "Date"
View(EPA.Air.PM25_NC18)
#########################

class(EPA.Air.PM25_NC19$Date)  #Check class of variable
## [1] "factor"
EPA.Air.PM25_NC19$Date <- mdy(EPA.Air.PM25_NC19$Date)
class(EPA.Air.PM25_NC19$Date)  #Check change of class
## [1] "Date"
View(EPA.Air.PM25_NC19)

# 4. Select Columns

# Select the following columns: Date, DAILY_AQI_VALUE, Site.Name,
# AQS_PARAMETER_DESC, COUNTY, SITE_LATITUDE, SITE_LONGITUDE

########################### Air Data #########################

processed_EPA.Air.NC18 <- EPA.Air.NC18[c("Date", "DAILY_AQI_VALUE", "Site.Name",
    "AQS_PARAMETER_DESC", "COUNTY", "SITE_LATITUDE", "SITE_LONGITUDE")]
processed_EPA.Air.NC18
##############

processed_EPA.Air.NC19 <- EPA.Air.NC19[c("Date", "DAILY_AQI_VALUE", "Site.Name",
    "AQS_PARAMETER_DESC", "COUNTY", "SITE_LATITUDE", "SITE_LONGITUDE")]
processed_EPA.Air.NC19
######################### PM 2.5 #################################

processed_EPA.Air.PM25_NC18 <- EPA.Air.PM25_NC18[c("Date", "DAILY_AQI_VALUE", "Site.Name",
    "AQS_PARAMETER_DESC", "COUNTY", "SITE_LATITUDE", "SITE_LONGITUDE")]
processed_EPA.Air.PM25_NC18
#################

processed_EPA.Air.PM25_NC19 <- EPA.Air.PM25_NC19[c("Date", "DAILY_AQI_VALUE", "Site.Name",
    "AQS_PARAMETER_DESC", "COUNTY", "SITE_LATITUDE", "SITE_LONGITUDE")]
processed_EPA.Air.PM25_NC19
# 5. Filling all cells in AQS_PARAMETER_DESC with 'PM2.5'

processed_EPA.Air.PM25_NC18$AQS_PARAMETER_DESC <- "PM2.5"

processed_EPA.Air.PM25_NC19$AQS_PARAMETER_DESC <- "PM2.5"

# 6. Save four processed data-sets in the Processed folder

write.csv(processed_EPA.Air.NC18, file = "C:/Users/sasho/Desktop/Environ Data Analytics/Env872 Workspace/EDA-Fall2022_SM/Data/Processed/processed_EPA.Air.NC18.csv",
    row.names = FALSE)

write.csv(processed_EPA.Air.NC18, file = "C:/Users/sasho/Desktop/Environ Data Analytics/Env872 Workspace/EDA-Fall2022_SM/Data/Processed/processed_EPA.Air.NC19.csv",
    row.names = FALSE)

write.csv(processed_EPA.Air.NC18, file = "C:/Users/sasho/Desktop/Environ Data Analytics/Env872 Workspace/EDA-Fall2022_SM/Data/Processed/processed_EPA.Air.PM25_NC18.csv",
    row.names = FALSE)

write.csv(processed_EPA.Air.NC18, file = "C:/Users/sasho/Desktop/Environ Data Analytics/Env872 Workspace/EDA-Fall2022_SM/Data/Processed/processed_EPA.Air.PM25_NC19.csv",
    row.names = FALSE)

Combine datasets

  1. Combine the four datasets with rbind. Make sure your column names are identical prior to running this code.

  2. Wrangle your new dataset with a pipe function (%>%) so that it fills the following conditions:

  1. Spread your datasets such that AQI values for ozone and PM2.5 are in separate columns. Each location on a specific date should now occupy only one row.

  2. Call up the dimensions of your new tidy dataset.

  3. Save your processed dataset with the following file name: “EPAair_O3_PM25_NC1718_Processed.csv”

# 7. Check that column names are the same

colnames(processed_EPA.Air.NC18)
## [1] "Date"               "DAILY_AQI_VALUE"    "Site.Name"         
## [4] "AQS_PARAMETER_DESC" "COUNTY"             "SITE_LATITUDE"     
## [7] "SITE_LONGITUDE"
colnames(processed_EPA.Air.NC19)
## [1] "Date"               "DAILY_AQI_VALUE"    "Site.Name"         
## [4] "AQS_PARAMETER_DESC" "COUNTY"             "SITE_LATITUDE"     
## [7] "SITE_LONGITUDE"
colnames(processed_EPA.Air.PM25_NC18)
## [1] "Date"               "DAILY_AQI_VALUE"    "Site.Name"         
## [4] "AQS_PARAMETER_DESC" "COUNTY"             "SITE_LATITUDE"     
## [7] "SITE_LONGITUDE"
colnames(processed_EPA.Air.PM25_NC19)
## [1] "Date"               "DAILY_AQI_VALUE"    "Site.Name"         
## [4] "AQS_PARAMETER_DESC" "COUNTY"             "SITE_LATITUDE"     
## [7] "SITE_LONGITUDE"
# Combine with rbind

combined_EPA.data <- rbind(processed_EPA.Air.NC18, processed_EPA.Air.NC19, processed_EPA.Air.PM25_NC18,
    processed_EPA.Air.PM25_NC19)

# 8. Using the pipe function

filtered_combined_EPA.data <- combined_EPA.data %>%
    filter(Site.Name %in% c("Linville Falls", "Durham Armory", "Leggett", "Hattie Avenue",
        "Clemmons Middle", "Mendenhall School", "Frying Pan Mountain", "West Johnston Co.",
        "Garinger High School", "Castle Hayne", "Pitt Agri. Center", "Bryson City",
        "Millbrook School")) %>%
    group_by(AQS_PARAMETER_DESC, Date, Site.Name, COUNTY) %>%
    summarise(DAILY_AQI_VALUE = mean(DAILY_AQI_VALUE), SITE_LATITUDE = mean(SITE_LATITUDE),
        SITE_LONGITUDE = mean(SITE_LONGITUDE)) %>%
    mutate(Month = month(Date), Year = year(Date))
## `summarise()` has grouped output by 'AQS_PARAMETER_DESC', 'Date', 'Site.Name'.
## You can override using the `.groups` argument.
# 9. Spreading Data sets

filtered_combined_EPA.data2 <- filtered_combined_EPA.data %>%
    pivot_wider(names_from = AQS_PARAMETER_DESC, values_from = DAILY_AQI_VALUE)

filtered_combined_EPA.data2  #Check spreading
# 10. Call up the dimensions of tidy dataset

dim(filtered_combined_EPA.data2)
## [1] 8976    9
# 11. Save your dataset

write.csv(filtered_combined_EPA.data2, file = "C:/Users/sasho/Desktop/Environ Data Analytics/Env872 Workspace/EDA-Fall2022_SM/Data/Processed/EPAair_O3_PM25_NC1718_Processed.csv",
    row.names = FALSE)

Generate summary tables

  1. Use the split-apply-combine strategy to generate a summary data frame. Data should be grouped by site, month, and year. Generate the mean AQI values for ozone and PM2.5 for each group. Then, add a pipe to remove instances where a month and year are not available (use the function drop_na in your pipe).

  2. Call up the dimensions of the summary dataset.

# 12a + b

filtered_combined_EPA.data3 <- filtered_combined_EPA.data2 %>%
    group_by(Site.Name, Month, Year) %>%
    summarize(Ozone = mean(Ozone), PM2.5 = mean(PM2.5)) %>%
    drop_na(Ozone) %>%
    drop_na(PM2.5)
## `summarise()` has grouped output by 'Site.Name', 'Month'. You can override
## using the `.groups` argument.
filtered_combined_EPA.data3  # View data
# 13.

dim(filtered_combined_EPA.data3)
## [1] 101   5
  1. Why did we use the function drop_na rather than na.omit?

Answer: In this case, we wanted to remove all the NA values from the dataset instead of keeping the NA values but not incorporating them into our calculations.